The attached dataset contains observations about AirQuality in New York. The data is already split in training and test set.
Your task is:
if (rstudioapi::isAvailable())
setwd(dirname(rstudioapi::getActiveDocumentContext()$path))
library(data.table)
library(plotly)
## Loading required package: ggplot2
##
## Attaching package: 'plotly'
## The following object is masked from 'package:ggplot2':
##
## last_plot
## The following object is masked from 'package:stats':
##
## filter
## The following object is masked from 'package:graphics':
##
## layout
library(caret)
## Loading required package: lattice
train_data <- read.csv("data/airquality.training.csv")
test_data <- read.csv("data/airquality.test.csv")
head(train_data) # we visualise the first rows from our train data
## Ozone Solar.R Wind Temp Month Day
## 1 41 190 7.4 67 5 1
## 2 36 118 8.0 72 5 2
## 3 12 149 12.6 74 5 3
## 4 18 313 11.5 62 5 4
## 5 NA NA 14.3 56 5 5
## 6 28 NA 14.9 66 5 6
str(train_data)
## 'data.frame': 122 obs. of 6 variables:
## $ Ozone : int 41 36 12 18 NA 28 23 8 7 16 ...
## $ Solar.R: int 190 118 149 313 NA NA 299 19 NA 256 ...
## $ Wind : num 7.4 8 12.6 11.5 14.3 14.9 8.6 20.1 6.9 9.7 ...
## $ Temp : int 67 72 74 62 56 66 65 61 74 69 ...
## $ Month : int 5 5 5 5 5 5 5 5 5 5 ...
## $ Day : int 1 2 3 4 5 6 7 9 11 12 ...
dim(train_data) # it has 122 rows and 6 columns
## [1] 122 6
dim(test_data) # test dataset has 31 rows
## [1] 31 6
str(train_data)
## 'data.frame': 122 obs. of 6 variables:
## $ Ozone : int 41 36 12 18 NA 28 23 8 7 16 ...
## $ Solar.R: int 190 118 149 313 NA NA 299 19 NA 256 ...
## $ Wind : num 7.4 8 12.6 11.5 14.3 14.9 8.6 20.1 6.9 9.7 ...
## $ Temp : int 67 72 74 62 56 66 65 61 74 69 ...
## $ Month : int 5 5 5 5 5 5 5 5 5 5 ...
## $ Day : int 1 2 3 4 5 6 7 9 11 12 ...
na_train <- sum(is.na(train_data))
na_train # there are 34 NA values in our train data
## [1] 34
na_test <- sum(is.na(test_data))
na_test # there are 10 Na values in test data
## [1] 10
# distribution of the whole dataset into train and test
whole_data <- nrow(rbind(train_data, test_data))
print(paste("The ration Train/Test is aproximatelly", round(nrow(train_data)/whole_data*100), "/", round(nrow(test_data)/whole_data*100))) # 80/20
## [1] "The ration Train/Test is aproximatelly 80 / 20"
# let's get rid of NA values
train_data <- na.omit(train_data)
test_data <- na.omit(test_data)
dim(train_data) # now train dataset has only 90 rows, from 122
## [1] 90 6
dim(test_data) # test dataset has 21 rows, from 31
## [1] 21 6
# check for correlation, then visualise it
cor(train_data[,1:4])
## Ozone Solar.R Wind Temp
## Ozone 1.0000000 0.3356398 -0.5957313 0.7216995
## Solar.R 0.3356398 1.0000000 -0.1374307 0.3101187
## Wind -0.5957313 -0.1374307 1.0000000 -0.5055036
## Temp 0.7216995 0.3101187 -0.5055036 1.0000000
# Ozone and Temperature have a very strong positive correlation (0.72)
# Ozone and Wind have a normal negative correlation (-0.59)
# Ozone and Solar Radiation have a weak positive correlation (0.35)
a <- plot_ly(train_data, x = ~Wind, y = ~Ozone, type = "scatter", mode = "markers", name = "Wind vs. Ozone") # kind of a negative correlation
b <- plot_ly(train_data, x = ~Solar.R, y = ~Ozone, type = "scatter", mode = "markers", name = "Solar Radiation vs. Ozone") # not really a thing here
c <- plot_ly(train_data, x = ~Temp, y = ~Ozone, type = "scatter", mode = "markers", name = "Temperature vs. Ozone") # a slighty positive correlation
subplot(a,b,c, nrows = 2, titleY = T, margin = 0.05)
# some bloxplots with the variables
box <- plot_ly(data = train_data, y= ~Ozone, type = "box", name = "Ozone")
box <- box %>% add_trace(data = train_data, y = ~ Solar.R, name = "Solar Radiation")
box <- box %>% add_trace(data = train_data, y = ~ Wind, name = "Wind")
box <- box %>% add_trace(data = train_data, y = ~ Temp, name = "Temperature")
box
# Solar Radiation has a wide range of values, and Ozone has 2 outliers
# evolution over time
train_data$Date <- as.Date(paste("2000", train_data$Month, train_data$Day, sep = "-"))
test_data$Date <- as.Date(paste("2000", test_data$Month, test_data$Day, sep = "-"))
# plot the data
plot <- plot_ly() %>%
add_lines(data = train_data, x = ~Date, y = ~Ozone, name = "Training Data", line = list(color = 'blue')) %>%
add_lines(data = test_data, x = ~Date, y = ~Ozone, name = "Test Data", line = list(color = 'red')) %>%
layout(title = "Ozone Levels Over Time",
xaxis = list(title = "Month-Day"),
yaxis = list(title = "Ozone Levels"))
plot
The goal is to train a regression model that predicts Ozone based on Wind, Solar.R and Temp. Do not use the variables Month and Day. In particular:
# create the polynomial regression for multiple degrees values (1 to 5)
poly_regr <- list()
RMSE_train <- numeric(5)
training.actuals <- train_data$Ozone
for (i in 1:5) {
poly_regr[[i]] <- lm(Ozone ~ poly(Wind, i) + poly(Solar.R, i) + poly(Temp, i), data = train_data) # we create a polynomial regression for each degree
training.predictions <- poly_regr[[i]]$fitted.values # we predict the values for the training data
RMSE_train[[i]] <- RMSE(training.predictions, training.actuals) # we calculate the RMSE for each degree
}
plot_ly(x = 1:5, y = RMSE_train, type = 'scatter', mode = 'line') %>% layout(title = 'RMSE for different degrees', xaxis = list(title = 'Degree'), yaxis = list(title = 'RMSE'))
# we plot the RMSE for each degree
# now we can compare training and test data
RMSE_test <- numeric(5)
test.actuals <- test_data$Ozone
# we will use also the variables and the regression from the previous snippet chunk, so we will not repeat them
for (i in 1:5) {
test.predictions <- predict(poly_regr[[i]], test_data[, c("Wind", "Solar.R", "Temp")])
RMSE_test[[i]] <- RMSE(test.predictions, test.actuals)
}
plot_ly(x = 1:5, y = RMSE_train, type = 'scatter', mode = 'line', name = 'Training') %>%
add_trace(x = 1:5, y = RMSE_test, mode = 'line', name = 'Test') %>%
layout(title = 'RMSE for different degrees', xaxis = list(title = 'Degree'), yaxis = list(title = 'RMSE'))
Based on the plot that includes the RMSE for training and testing data, the best degree would be 3. It has the lowest RMSE while the RMSE values being the closest, and with less complexity. I am thinking that at value 5 the model is still not overfitting, with smaller RMSE values, but it has a big complexity, so I would choose 3.
The other thing is that, for some reason, my final plot is not similar to the one that my colleague showed on the last lecture, where the RMSE for the test data was smaller that train data.
I looked over the code and I couldn’t find any mistake, so I am not sure what is happening. A feedback on this matter would be appreciated.